by Gasm Elbary Mohamed
Task: create a dashboard to share the findings with the management.
The dashboard should include:
First, view the dataset and check whether it needs any cleaning.
import matplotlib.pyplot as plt
import plotly.express as px
import folium
import json
from folium import plugins
import squarify
import pandas as pd
import seaborn as sns
from datetime import date
%matplotlib inline
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.colors as mcolors
import warnings
# Silence library warnings so the notebook output stays readable
warnings.filterwarnings("ignore")
# Apply the default seaborn theme and set a 9x8-inch default figure size
sns.set_theme()
sns.set(rc={'figure.figsize':(9,8)})
# Load the employee-salary workbook into a DataFrame
source_file = 'Employee_Salaries.xlsx'
df = pd.read_excel(source_file)
# Preview the first few rows
df.head()
| Full Name | Gender | Current Annual Salary | 2017 Gross Pay Received | 2017 Overtime Pay | Department | Department Name | Division | Assignment Category | Employee Position Title | Position Under-Filled | Date First Hired | Zip | Zip Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Employee1 | F | 70959.79 | 71316.72 | 0.00 | POL | Department of Police | MSB Information Mgmt and Tech Division Records... | Fulltime-Regular | Office Services Coordinator | NaN | 1986-09-22 | 20782 | Chillum |
| 1 | Employee2 | F | 110359.00 | 108040.82 | 0.00 | HHS | Department of Health and Human Services | Adult Protective and Case Management Services | Fulltime-Regular | Supervisory Social Worker | NaN | 1989-11-19 | 20705 | Beltsville |
| 2 | Employee3 | M | 55950.24 | 62575.19 | 7649.19 | COR | Correction and Rehabilitation | PRRS Facility and Security | Fulltime-Regular | Resident Supervisor II | NaN | 2014-05-05 | 20781 | Hyattsville |
| 3 | Employee4 | M | 95740.00 | 96055.94 | 0.00 | HCA | Department of Housing and Community Affairs | Affordable Housing Programs | Fulltime-Regular | Planning Specialist III | NaN | 2007-03-05 | 20707 | Laurel |
| 4 | Employee5 | M | 74732.00 | 98736.78 | 23468.73 | POL | Department of Police | PSB 6th District Special Assignment Team | Fulltime-Regular | Police Officer III | NaN | 2007-07-16 | 20705 | Beltsville |
# Print shape and dataset information
print("Data shape: ", df.shape)
# df.info() writes its report to stdout and returns None, so wrapping it in
# print() added a stray "None" line to the output — call it directly instead.
df.info()
Data shape: (9398, 14) <class 'pandas.core.frame.DataFrame'> RangeIndex: 9398 entries, 0 to 9397 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Full Name 9398 non-null object 1 Gender 9398 non-null object 2 Current Annual Salary 9398 non-null float64 3 2017 Gross Pay Received 9398 non-null float64 4 2017 Overtime Pay 9398 non-null float64 5 Department 9398 non-null object 6 Department Name 9398 non-null object 7 Division 9398 non-null object 8 Assignment Category 9398 non-null object 9 Employee Position Title 9398 non-null object 10 Position Under-Filled 1088 non-null object 11 Date First Hired 9398 non-null datetime64[ns] 12 Zip 9398 non-null int64 13 Zip Name 9398 non-null object dtypes: datetime64[ns](1), float64(3), int64(1), object(9) memory usage: 1.0+ MB None
# Count the missing values in each column
df.isnull().sum()
Full Name 0 Gender 0 Current Annual Salary 0 2017 Gross Pay Received 0 2017 Overtime Pay 0 Department 0 Department Name 0 Division 0 Assignment Category 0 Employee Position Title 0 Position Under-Filled 8310 Date First Hired 0 Zip 0 Zip Name 0 dtype: int64
There are missing values in the Position Under-Filled column only, which can be explained by the respective employees simply having no position under-filled. Therefore, there is no missing data in the columns of interest for this task.
The data needs no cleaning, and we can carry out the visualization task.
By checking the names of the cities in the dataset, I could conclude that the data is from Maryland, USA. To draw the gross pay on a map, I need to know the coordinates of the available zip codes; these are usually available in a GeoJSON file. Searching online, I found a GeoJSON of Maryland at the following link.
# Total 2017 gross pay per ZIP code
df_zip_totGrossPay = (
    df.groupby("Zip", as_index=False)[["2017 Gross Pay Received"]].sum()
)
# The GeoJSON stores ZIP codes as strings, so cast ours to match —
# easier and faster than converting every ZIP inside the GeoJSON.
df_zip_totGrossPay["Zip"] = df_zip_totGrossPay["Zip"].astype(str)
# Load the Maryland GeoJSON and keep only the features whose ZIP code
# appears in our dataset, so the choropleth file stays small.
with open('maryland_zips.geojson') as jsonFile:
    data = json.load(jsonFile)

# Use a set for O(1) membership tests (a list scan per feature is O(n)),
# and a comprehension instead of indexing with range(len(...)).
present_zip = set(df_zip_totGrossPay["Zip"])
geozips = [feature for feature in data['features']
           if feature['properties']['name'] in present_zip]

# Check that every ZIP code in the dataset found a matching GeoJSON feature
len(geozips) == df_zip_totGrossPay.shape[0]
True
# Build a new GeoJSON FeatureCollection holding only the ZIPs of interest
new_json = {'type': 'FeatureCollection', 'features': geozips}
# Save it as Zips_of_interest.json. Use a context manager so the file
# handle is closed deterministically — the original open(...).write(...)
# left the handle open. json.dump streams straight to the file.
with open("Zips_of_interest.json", "w") as outFile:
    json.dump(new_json, outFile, sort_keys=True, indent=4, separators=(',', ': '))
145873
# 39.045753, -76.641273 are (approximately) the center coordinates of Maryland.
# NOTE: the original passed `default_zoom_start`, which is not a folium.Map
# parameter and was silently ignored; the correct keyword is `zoom_start`.
m = folium.Map(location=[39.045753, -76.641273], zoom_start=13)
# Shade each ZIP-code polygon by its total 2017 gross pay
folium.Choropleth(
    geo_data="Zips_of_interest.json",
    name='Total Gross Pay by Zip Code',
    data=df_zip_totGrossPay,
    columns=['Zip', '2017 Gross Pay Received'],
    # GeoJSON feature property that matches the 'Zip' column
    key_on='feature.properties.name',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Scaled 2017 Gross Pay Received',
).add_to(m)
folium.LayerControl().add_to(m)
# Save an interactive copy and display the map inline
m.save(outfile='Total Gross Pay by Zip Code Map.html')
m
# Total 2017 gross pay per gender, as a bar chart
gender_totals = df.groupby("Gender")["2017 Gross Pay Received"].sum()
ax = gender_totals.plot.bar(
    title='2017 Total Gross Pay Received by Gender',
    color=['pink', 'silver'],
    xlabel='Gender',
    ylabel='Total Gross Pay',
    figsize=(8, 5),
    rot=0,
)
# Average 2017 gross pay per gender, as a bar chart
gender_means = df.groupby("Gender")["2017 Gross Pay Received"].mean()
ax = gender_means.plot.bar(
    title='2017 Average Gross Pay Received by Gender',
    color=['pink', 'silver'],
    xlabel='Gender',
    ylabel='Average Gross Pay',
    figsize=(8, 5),
    rot=0,
)
# Assuming the dataset is current and the employees are still working,
# compute each employee's tenure in years up to today.
today_date = date.today()
# Divide by days instead of np.timedelta64(1, 'Y'): modern pandas/numpy
# reject division by non-linear units such as 'Y'. 365.2425 days is the
# mean Gregorian year, matching numpy's definition of the 'Y' unit.
df['years_hired'] = (
    (np.datetime64(today_date) - df['Date First Hired'])
    / np.timedelta64(1, 'D')
    / 365.2425
)
# The task plots a bar per year, so truncate the difference to whole years.
df['years_hired'] = df['years_hired'].astype(int)
# Total 2017 gross pay by years of tenure, as a bar chart
tenure_totals = df.groupby("years_hired")["2017 Gross Pay Received"].sum()
ax = tenure_totals.plot.bar(
    title='2017 Total Gross Pay Received by Years Hired',
    color='lightsalmon',
    xlabel='Years Hired',
    ylabel='Total Gross Pay',
    figsize=(16, 5),
    rot=0,
)
# Average 2017 gross pay by years of tenure, as a bar chart
tenure_means = df.groupby("years_hired")["2017 Gross Pay Received"].mean()
ax = tenure_means.plot.bar(
    title='2017 Average Gross Pay Received by Years Hired',
    color='lightsalmon',
    xlabel='Years Hired',
    ylabel='Average Gross Pay',
    figsize=(16, 5),
    rot=0,
)
# Total overtime pay per department, shown as a treemap
overtime_totals = df.groupby("Department Name")[["2017 Overtime Pay"]].sum()
# Exclude departments that logged no overtime at all
overtime_totals = overtime_totals.loc[overtime_totals["2017 Overtime Pay"] > 0]
# Friendlier column name — it appears in the hover tooltip
overtime_totals = overtime_totals.rename(
    columns={"2017 Overtime Pay": "Total Overtime Pay"}
)
fig = px.treemap(
    overtime_totals.reset_index(),
    path=["Department Name"],
    values="Total Overtime Pay",
    width=1200,
    height=800,
    title="Treemap of Total Overtime Pay and Department name ",
)
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.update_traces(hovertemplate="Department: %{label}: <br>Total Overtime Pay: %{value}")
fig.show()
# Persist the figure as a static image and as an interactive page
fig.write_image(file='treemap_total.png', format='png', scale=10)
fig.write_html(file='treemap_total.html')
# Average overtime pay per department, shown as a treemap
overtime_means = df.groupby("Department Name")[["2017 Overtime Pay"]].mean().round(2)
# Exclude departments whose average overtime pay is 0
overtime_means = overtime_means.loc[overtime_means["2017 Overtime Pay"] > 0]
# Friendlier column name — it appears in the hover tooltip
overtime_means = overtime_means.rename(
    columns={"2017 Overtime Pay": "Average Overtime Pay"}
)
fig = px.treemap(
    overtime_means.reset_index(),
    path=["Department Name"],
    values="Average Overtime Pay",
    width=1200,
    height=800,
    title="Treemap of Average Overtime Pay and Department Name",
)
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.update_traces(hovertemplate="Department: %{label}: <br>Average Overtime Pay: %{value}")
fig.show()
# Persist the figure as a static image and as an interactive page
fig.write_image(file='treemap_average.png', format='png', scale=10)
fig.write_html(file='treemap_average.html')